Business Understanding
It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.
Dataset Description
The dataset contains transactions made by credit cards in September 2013 by European cardholders. It presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
Objective: Identify fraudulent credit card transactions.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Any results you write to the current directory are saved as output.
# Import libraries that are necessary for visualizing purposes
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
# Load the raw Kaggle credit-card dataset and inspect dtypes and missing values.
data = pd.read_csv('creditcard.csv')
data.info()
data.isnull().sum()
We know that all features except Amount and Time are derived from principal components through PCA, and from initial inspection there appear to be no missing values in the dataset.
Let's check the target variable distribution denoted by label 'Class' in the dataset
# Raw counts of normal (0) vs fraud (1) transactions.
data['Class'].value_counts()
# Let's print the proportion of target variable in the dataset
print('No Frauds', round(data['Class'].value_counts()[0]/len(data) * 100,2), '% of the dataset')
print('Frauds', round(data['Class'].value_counts()[1]/len(data) * 100,2), '% of the dataset')
# Pie chart of the class imbalance (plotly).
labels = ['Normal', 'Fraud']
values = data['Class'].value_counts()
colors = ['rgb(32, 148, 159)','#FEBFB3']
trace = go.Pie(labels=labels, values=values,
hoverinfo='label+percent', textinfo='value',
textfont=dict(size=20),
marker=dict(colors=colors,
line=dict(color='#000000', width=0.5)))
py.iplot([trace], filename='styled_pie_chart')
data.shape
# Summary statistics of transaction Amount, split by class.
data['Amount'].groupby(data['Class']).describe()
from sklearn.preprocessing import StandardScaler
# Standardize 'Amount' to zero mean / unit variance in a new column.
# NOTE(review): the scaler is fit on the FULL dataset before the train/test
# split below, which leaks test-set statistics into training -- confirm intended.
data['normalizedAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
from plotly import tools
# Split Amount and Class by label so fraud / non-fraud can be plotted side by side.
Normal_amt = data['Amount'].loc[data['Class']==0]
fraud_amt = data['Amount'].loc[data['Class']==1]
normal_class =data['Class'].loc[data['Class']==0]
fraud_class = data['Class'].loc[data['Class']==1]
# Box plots of raw transaction amounts per class.
trace0 = go.Box(
y= Normal_amt,
x= normal_class,
name='Not Fraud',
marker=dict(
color='#3D9970'
)
)
trace1 = go.Box(
y= fraud_amt,
x= fraud_class,
name='Fraud',
marker=dict(
color='#FF4136'
)
)
# Box plots of log-amounts per class.
# NOTE(review): np.log yields -inf for zero-amount transactions; confirm the
# plot handles those values as intended.
trace2 = go.Box(
y= np.log(Normal_amt),
x= normal_class,
name='Not Fraud Log Amount',
marker=dict(
color= 'rgb(32, 148, 159)'
)
)
trace3 = go.Box(
y= np.log(fraud_amt),
x= fraud_class,
name='Fraud Log Amount ',
marker=dict(
color='#FEBFB3'
)
)
# 2x2 grid: raw amounts on the top row, log amounts on the bottom row.
fig = tools.make_subplots(rows=2, cols=2, shared_yaxes=True)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 2, 2)
fig['layout'].update(height=700, width=800,
title='Box Plot Target Variable vs Transcation Amount')
py.iplot(fig, filename='multiple-subplots-shared-yaxes')
data.shape
# Drop raw Time and Amount (normalizedAmount replaces Amount as a feature).
data = data.drop(['Time','Amount'],axis=1)
data.head()
# Feature matrix X (V1..V28 + normalizedAmount) and target y ('Class').
X = data.drop('Class', axis=1)
y = data['Class']
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold split keeps the ~0.17% fraud rate in every fold.
# NOTE: only the LAST fold's indices survive this loop, so the effective
# train/test split is fold 5 (an 80/20 stratified split).
skfold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in skfold.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Turn into an array
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(y_train, return_counts=True)
test_unique_label, test_counts_label = np.unique(y_test, return_counts=True)
print('-' * 100)
print('Label Distributions: \n')
print('The training set distribution is {}'.format(train_counts_label/ len(y_train)))
# BUG FIX: this line previously said "training set" while printing the TEST
# distribution (copy-paste error); the message now matches the data shown.
print('The test set distribution is {}'.format(test_counts_label/ len(y_test)))
print('Length of X (train): {} | Length of y (train): {}'.format(len(X_train), len(y_train)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(X_test), len(y_test)))
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from pprint import pprint
# Per-fold metric accumulators for the cross-validation loop further below.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []
# Initialize the random forest base model.
# NOTE(review): despite the original comment claiming class_weight='balanced',
# the actual setting is {1: 10} -- the fraud class is up-weighted 10x.
clf_rf = RandomForestClassifier(n_estimators =50,oob_score = False,
random_state=42,class_weight= {1:10}, n_jobs=-1)
print('Parameters currently in use:\n')
pprint(clf_rf.get_params())
# Implementing SMOTE Technique
# Cross Validating the right way
# Hyper-parameter search space for the random forest.
# Number of estimators
n_estimators = [30,50]
# Number of features to consider at every split
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [6, 9, 15]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth}
# Randomized search: 4 samples out of the 2*2*3 = 12 possible combinations.
rand_rf = RandomizedSearchCV(clf_rf, param_distributions=random_grid, n_iter=4,
random_state=42, n_jobs=-1, return_train_score=False)
# Cross-validate with resampling done INSIDE each fold: SMOTEENN sits in the
# pipeline, so only the fold's training portion is resampled and the held-out
# portion stays untouched.
for train, test in skfold.split(X_train, y_train):
    pipeline = imbalanced_make_pipeline(SMOTEENN(sampling_strategy='minority'), rand_rf)
    # SMOTE happens during Cross Validation not before..
    model = pipeline.fit(X_train[train], y_train[train])
    # rand_rf was fitted in place as a pipeline step, so best_estimator_ is set.
    best_est = rand_rf.best_estimator_
    prediction = best_est.predict(X_train[test])
    accuracy_lst.append(pipeline.score(X_train[test], y_train[test]))
    precision_lst.append(precision_score(y_train[test], prediction))
    recall_lst.append(recall_score(y_train[test], prediction))
    f1_lst.append(f1_score(y_train[test], prediction))
    auc_lst.append(roc_auc_score(y_train[test], prediction))
# Mean scores across the folds.
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)
print('The best estimator for random forest with SMOTEENN is {}'.format(best_est))
import pickle
# Persist the best estimator to disk, then demonstrate reloading it.
# NOTE(review): files are opened without `with`, so the handles close only when
# garbage-collected; also, pickle.load is only safe on trusted files.
rf_filename = 'rf_best_estimator.sav'
pickle.dump(best_est, open(rf_filename, 'wb'))
if os.path.isfile('rf_best_estimator.sav'):
    clf_rf = pickle.load(open('rf_best_estimator.sav', 'rb'))
    print("Fitted random forest model has been loaded from pickle file. Run prediction on dataset")
else:
    print('Pickle object not found. Train the model and dump the fitted model using pickle')
from imblearn.ensemble import BalancedRandomForestClassifier
# Balanced random forest: each tree trains on a bootstrap sample that is
# randomly under-sampled so both classes are equally represented.
brf = BalancedRandomForestClassifier(n_estimators=200, replacement= True,max_features =0.3,
random_state=0,class_weight='balanced_subsample',max_depth = 25,
n_jobs=-1)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : ndarray of shape (n_classes, n_classes), raw confusion matrix counts.
    classes : sequence of tick labels for both axes.
    normalize : if True, each row is rescaled to sum to 1 before plotting.
    title : plot title.
    cmap : matplotlib colormap for the heatmap.
    """
    # BUG FIX: normalization previously happened AFTER plt.imshow, so the
    # heatmap colors always reflected raw counts even with normalize=True.
    # Normalize first so colors and cell annotations agree.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # Annotate every cell; show 2-decimal floats when normalized, ints otherwise.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
fig, ax = plt.subplots(ncols=1)
# Balanced accuracy and geometric mean are robust to the heavy class imbalance.
print('Balanced Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_brf),
geometric_mean_score(y_test, y_pred_brf)))
cm_brf = confusion_matrix(y_test, y_pred_brf)
plot_confusion_matrix(cm_brf, classes=np.unique(data['Class']),
title='Balanced random forest')
from sklearn.metrics import classification_report
labels = ['No Fraud', 'Fraud']
print(classification_report(y_test, y_pred_brf, target_names=labels))
from sklearn.metrics import average_precision_score
# Fraud-class probability is the ranking score for the precision-recall curve.
y_score = brf.predict_proba(X_test)[:,1]
average_precision = average_precision_score(y_test, y_score)
print('Average precision-recall score: {0:0.2f}'.format(
average_precision))
from sklearn.metrics import precision_recall_curve
fig = plt.figure(figsize=(12,6))
precision, recall, _ = precision_recall_curve(y_test, y_score)
plt.step(recall, precision, color='r', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='#F59B00')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('OverSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(
average_precision), fontsize=16)
# Sweep the decision threshold and report precision/recall plus a confusion
# matrix at each cut-off in a 3x3 grid of subplots.
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
plt.figure(figsize=(10,10))
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_score > i
    plt.subplot(3,3,j)
    j += 1
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test,y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print('Threshold {}'.format(i))
    # Precision = TP/(TP+FP), Recall = TP/(TP+FN), read off the matrix cells.
    print("Precision metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[0,1]))
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
    # Plot non-normalized confusion matrix
    class_names = ['0','1']
    plot_confusion_matrix(cnf_matrix
    , classes=class_names
    , title='Threshold >= %s'%i)
# Rank and plot the balanced random forest's feature importances.
features =['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28','normalizedAmount']
plt.figure(figsize = (9,5))
feat_import = pd.DataFrame({'Feature': features, 'Feature importance': brf.feature_importances_})
feat_import = feat_import.sort_values(by='Feature importance',ascending=False)
g = sns.barplot(x='Feature',y='Feature importance',data=feat_import)
g.set_xticklabels(g.get_xticklabels(),rotation=90)
g.set_title('Features importance - Random Forest',fontsize=20)
plt.show()
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
def make_model(n_features):
    """Build and compile a 4-hidden-layer binary-classification MLP.

    Every hidden block is Dense -> ReLU -> BatchNorm -> Dropout, with the
    dropout rate shrinking as the layers narrow (200/0.5, 100/0.25, 50/0.15,
    25/0.1); the head is a single sigmoid unit trained with binary
    cross-entropy under the Adam optimizer.
    """
    model = Sequential()
    # (units, dropout rate) per hidden block, widest first.
    hidden_blocks = [(200, 0.5), (100, 0.25), (50, 0.15), (25, 0.1)]
    for block_index, (units, drop_rate) in enumerate(hidden_blocks):
        if block_index == 0:
            # First layer declares the input dimensionality.
            model.add(Dense(units, input_shape=(n_features,),
                            kernel_initializer='glorot_normal'))
        else:
            model.add(Dense(units, kernel_initializer='glorot_normal'))
        model.add(Activation('relu'))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
import time
from functools import wraps

def timeit(f):
    """Decorator that reports wall-clock runtime of *f*.

    The wrapped function returns ``(elapsed_seconds, result)`` instead of the
    bare result, so callers must unpack the tuple.
    """
    @wraps(f)
    def wrapper(*args, **kwds):
        started = time.time()
        outcome = f(*args, **kwds)
        duration = time.time() - started
        print('Elapsed computation time: {:.3f} secs'.format(duration))
        return (duration, outcome)
    return wrapper
@timeit
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    """Train the MLP on the raw (imbalanced) data and return the test ROC AUC.

    Returns (via the @timeit wrapper) a (elapsed_seconds, auc) tuple.
    """
    model = make_model(X_train.shape[1])
    print(model.summary())
    model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=200)
    # BUG FIX: the single sigmoid output gives predictions of shape (n, 1),
    # so the original `[:, 1]` indexed a non-existent second column
    # (IndexError); flatten the one column instead. (The later cell using
    # predict_proba without indexing confirms the (n, 1) shape.)
    y_pred = model.predict_proba(X_test, batch_size=200).ravel()
    return roc_auc_score(y_test, y_pred)
from imblearn.keras import BalancedBatchGenerator
@timeit
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    """Train the MLP on class-balanced mini-batches and return test ROC AUC.

    BalancedBatchGenerator resamples each batch so both classes are equally
    represented. Returns (via @timeit) a (elapsed_seconds, auc) tuple.
    """
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train, y_train,
                                                batch_size=200,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=5, verbose=1)
    # BUG FIX: the sigmoid head outputs shape (n, 1); `[:, 1]` indexed a
    # non-existent second column (IndexError). Flatten column 0 instead.
    y_pred = model.predict_proba(X_test, batch_size=200).ravel()
    return roc_auc_score(y_test, y_pred)
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import NeighbourhoodCleaningRule
# NOTE(review): a single `model` instance is reused across all five folds, so
# each fold keeps training on weights already fit to other folds' data and the
# per-fold validation scores are optimistic -- confirm whether intended.
model = make_model(X_train.shape[1])
skf = StratifiedKFold(n_splits=5)
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = X_train[train_idx]
    y_local_train = y_train[train_idx]
    X_local_test = X_train[valid_idx]
    y_local_test = y_train[valid_idx]
    # First pass: randomly balanced mini-batches.
    training_generator = BalancedBatchGenerator(X_local_train, y_local_train,
    batch_size=100,
    random_state=42)
    model.fit_generator(generator=training_generator, epochs=10,validation_data=(X_local_test, y_local_test) ,verbose=1)
    # Second pass: batches balanced by NeighbourhoodCleaningRule under-sampling.
    training_generator, steps_per_epoch = balanced_batch_generator(X_local_train, y_local_train,
    batch_size=100, sampler=NeighbourhoodCleaningRule(),random_state=42)
    model.fit_generator(generator=training_generator, steps_per_epoch=steps_per_epoch,
    epochs=20,validation_data= (X_local_test, y_local_test), verbose=1)
# BUG FIX: this definition was collapsed onto a single line
# ("@timeit def fit_balanced_model(...) ..."), which is a SyntaxError in
# Python. Reformatted into a valid multi-line definition; no logic changed.
@timeit
def fit_balanced_model(X_train, y_train, X_test, y_test):
    """Train the MLP on balanced mini-batches and return the fitted model.

    X_test / y_test are accepted but unused, matching the original signature.
    Returns (via @timeit) a (elapsed_seconds, model) tuple.
    """
    model = make_model(X_train.shape[1])
    training_generator, steps_per_epoch = balanced_batch_generator(
        X_train, y_train, batch_size=100, random_state=42)
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=10, verbose=1)
    return model
# Hard class predictions from the cross-validated keras model.
y_pred_keras = model.predict_classes(X_test, batch_size=100, verbose=0)
y_pred_keras.shape
#y_pred_keras =keras_model.predict_classes(X_test)
labels = ['No Fraud', 'Fraud']
print(classification_report(y_test, y_pred_keras, target_names=labels))
cm_keras = confusion_matrix(y_test, y_pred_keras)
plot_confusion_matrix(cm_keras, classes=np.unique(data['Class']),
title='Keras Model')
from sklearn.metrics import average_precision_score
#y_keras_score = keras_model.predict_proba(X_test)[:,1]
# Probability scores (shape (n, 1)) for average-precision / PR-curve plots.
y_keras_score = model.predict_proba(X_test, batch_size=100)
keras_average_precision = average_precision_score(y_test, y_keras_score)
print('Average precision-recall score: {0:0.2f}'.format(
keras_average_precision))
# Same threshold sweep as for the random forest, now on the keras scores.
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
plt.figure(figsize=(10,10))
j = 1
for i in thresholds:
    y_keras_recall = y_keras_score > i
    plt.subplot(3,3,j)
    j += 1
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test,y_keras_recall)
    np.set_printoptions(precision=2)
    print('Threshold {}'.format(i))
    # Precision = TP/(TP+FP), Recall = TP/(TP+FN), read off the matrix cells.
    print("Precision metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[0,1]))
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
    # Plot non-normalized confusion matrix
    class_names = ['0','1']
    plot_confusion_matrix(cnf_matrix
    , classes=class_names
    , title='Threshold >= %s'%i)
# Precision-recall curve for the keras model's probability scores.
fig = plt.figure(figsize=(12,6))
precision, recall, _ = precision_recall_curve(y_test, y_keras_score)
plt.step(recall, precision, color='r', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='#F59B00')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('OverSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(
keras_average_precision), fontsize=16)
from sklearn import metrics as metrics
from sklearn.metrics import matthews_corrcoef

def classifier_metrics(model,estimator,actual,y_pred,proba):
    """Compute standard binary-classification metrics as a one-column DataFrame.

    model     : display name used as the column label and in the printout.
    estimator : unused; kept for interface compatibility with callers.
    actual    : ground-truth labels.
    y_pred    : hard class predictions.
    proba     : scores/probabilities for the positive class (for ROC AUC).
    Prints the table and returns it as a DataFrame.
    """
    scores = {
        'Accuracy' : metrics.accuracy_score(actual, y_pred),
        'Precision' : metrics.precision_score(actual, y_pred),
        'Recall' : metrics.recall_score(actual, y_pred),
        'F1 Score' : metrics.f1_score(actual, y_pred),
        'ROC AUC' : metrics.roc_auc_score(actual, proba),
        'Matthews Correlation Coefficient': matthews_corrcoef(actual, y_pred),
    }
    report = pd.DataFrame.from_dict(scores, orient='index')
    report.columns = [model]
    print('\n' + model + ' Metrics:')
    print(report)
    return report
# Summarize the keras model's metrics and print its architecture.
classifier_metrics('keras',model,y_test, y_pred_keras, y_keras_score)
print(model.summary())